# Package setup for the report. Each comment names the package loaded on the
# line just above it and says what it is used for.
library("data.table",quietly=TRUE,verbose=FALSE)
# data.table: fast reading of big tabular files (fread)
library("Biostrings",quietly=TRUE,verbose=FALSE)
# Biostrings: reading FASTA files (readAAStringSet)
library("limma",quietly=TRUE,verbose=FALSE)
# limma: Venn diagrams (vennCounts / vennDiagram)
library("ggplot2",quietly=TRUE,verbose=FALSE)
# ggplot2: static plots
library("viridis",quietly=TRUE,verbose=FALSE)
# viridis: colour palettes
library("GenomicRanges",quietly=TRUE,verbose=FALSE)
# GenomicRanges: genomic range intersections
library("DT",quietly=TRUE,verbose=FALSE)
# DT: interactive HTML tables for the HTML output (datatable)
library("treemap",quietly=TRUE,verbose=FALSE)
# treemap: treemap charts
library("DiagrammeR",quietly=TRUE,verbose=FALSE)
# DiagrammeR: Graphviz flow diagram (grViz)
library("plotly",quietly=TRUE,verbose=FALSE)
# plotly: interactive graphics (histogram)
library("knitr",quietly=TRUE,verbose=FALSE)
library("kableExtra",quietly=TRUE,verbose=FALSE)
# knitr / kableExtra: table formatting options
source("Utils.R")
# Utils.R: project-local helpers; presumably where
# DATAFRAME_MakeALineForAIsoProt (used further down) is defined -- TODO confirm
# ---- Input files ----
# Pipeline log summary: a headerless, 3-column, tab-separated file made of log
# lines gathered across the whole pipeline.
#   FLAG     : the "INFO" marker, there so the lines can be grepped
#   Variable : name of the logged quantity
#   value    : its value (Variable/value behave like a long-format data frame)
Info <- read.table(
  file = params$AllLogs,
  sep = "\t",
  header = FALSE,
  col.names = c("FLAG", "Variable", "value")
)

# Protein bank (FASTA) that was used for the mass-spec search.
ProteinBank <- readAAStringSet(params$ProteinBank)

# Peptide table produced by the MaxQuant search.
Pep <- fread(params$Pep)

# Rescue table (tab-separated, with a header line, only double quotes are
# treated as quoting characters): entries for the mRNA proteins that did not
# have a perfect match in the whole UniProt.
Rescue <- read.table(
  file = params$Rescue,
  header = TRUE,
  sep = "\t",
  quote = "\""
)

# Names of all the FASTA entries of the protein bank.
SEQUENCES_NAMES <- names(ProteinBank)

# Debug helpers: uncomment to check how R interprets the two regexps.
# cat(params$RefRegexp, "\n")
# cat(params$mRNARegexp, "\n")
# ---- Origin of the bank sequences: reference vs. mRNA ----
# Despite their names, UNIPROT_BOOL and STRINGTIE_BOOL hold integer positions
# into SEQUENCES_NAMES, not logical vectors.

# Reference entries: names matching the reference regexp that do not carry a
# ",Blast=" tag.
UNIPROT_BOOL <- which(
  !grepl(",Blast=", SEQUENCES_NAMES) &
    grepl(params$RefRegexp, SEQUENCES_NAMES)
)

# mRNA entries: names matching the mRNA regexp.
STRINGTIE_BOOL <- grep(params$mRNARegexp, SEQUENCES_NAMES)

# Classic Venn diagram recipe: one row per sequence name, one 0/1 membership
# column per set, then vennCounts() tabulates every combination.
SequencesMatrix <- matrix(
  0,
  nrow = length(SEQUENCES_NAMES),
  ncol = 2,
  dimnames = list(SEQUENCES_NAMES, c("Reference", "mRNA"))
)
SequencesMatrix[UNIPROT_BOOL, "Reference"] <- 1
SequencesMatrix[STRINGTIE_BOOL, "mRNA"] <- 1
SequencesCounts <- vennCounts(SequencesMatrix)
# ---- Classification of the mRNA-derived proteins of the bank ----
# Every class below is a character vector of FASTA entry names, selected by
# the tags found in the name (",UNIPROT=" for a perfect match, ",Blast=" for
# a BLAST hit).

# Bank entries that come from the mRNA (name matches the mRNA regexp).
PROTEINBANK_MRNA_NAMES <- grep(params$mRNARegexp, names(ProteinBank), value = TRUE)

# Bank entries whose name starts with "sp" (Swiss-Prot), i.e. either Swiss-Prot
# canonical entries that are imperfectly present, or isoforms that had a BLAST
# hit.
SWISSPROT.UNMATCHED <- grep("^sp", names(ProteinBank), value = TRUE)

# mRNA entries that have a perfect match with a UniProt entry.
PerfectMatch <- grep(",UNIPROT=", PROTEINBANK_MRNA_NAMES, value = TRUE)
# "-<digits>|" marks an isoform accession, "tr|" a TrEMBL entry.
pm_is_isoform <- grepl("-[0-9]+\\|", PerfectMatch)
pm_is_trembl <- grepl("tr\\|", PerfectMatch)
# Perfect match that is neither TrEMBL nor a Swiss-Prot isoform.
PerfectMatch.SP.Canonical <- PerfectMatch[!pm_is_isoform & !pm_is_trembl]
# Perfect match with a Swiss-Prot isoform.
PerfectMatch.SP.Isoform <- PerfectMatch[pm_is_isoform & !pm_is_trembl]
# Perfect match with a TrEMBL entry.
PerfectMatch.TR <- PerfectMatch[pm_is_trembl]

# mRNA entries without a perfect match but with a BLAST hit in UniProt.
BLAST.Hits <- grep(",Blast=", PROTEINBANK_MRNA_NAMES, value = TRUE)
blast_is_isoform <- grepl("-[0-9]+\\|", BLAST.Hits)
blast_is_trembl <- grepl(",Blast=tr\\|", BLAST.Hits)
BLAST.Hits.SP.Canonical <- BLAST.Hits[!blast_is_isoform & !blast_is_trembl]
# TrEMBL hits are excluded here, exactly as for PerfectMatch.SP.Isoform, so
# that the three BLAST classes are mutually exclusive and a name is never
# counted both as "Isoform" and as "TrEMBL" in ProteinBank.Counts.
BLAST.Hits.SP.Isoform <- BLAST.Hits[blast_is_isoform & !blast_is_trembl]
BLAST.Hits.TR <- BLAST.Hits[blast_is_trembl]

# mRNA entries that have neither a perfect match nor a BLAST hit.
Unknown <- PROTEINBANK_MRNA_NAMES[
  !grepl(",UNIPROT=", PROTEINBANK_MRNA_NAMES) &
    !grepl(",Blast=", PROTEINBANK_MRNA_NAMES)
]

# Long-format summary: one row per (Match, Form) class with its size.
ProteinBank.Counts <- data.frame(
  Match = c(rep("Perfect match", 3), rep("Blast hit", 3), "unknown"),
  Form = c(rep(c("Canonical", "Isoform", "TrEMBL"), 2), "unknown"),
  Effectif = c(
    length(PerfectMatch.SP.Canonical),
    length(PerfectMatch.SP.Isoform),
    length(PerfectMatch.TR),
    length(BLAST.Hits.SP.Canonical),
    length(BLAST.Hits.SP.Isoform),
    length(BLAST.Hits.TR),
    length(Unknown)
  )
)
# ---- UniProt coverage: perfect match vs. BLAST hit, per UniProt entry ----
# Names of the bank entries related to UniProt (name matches the reference
# regexp), either associated with an mRNA or reinjected.
UNIPROT_NAMES <- grep(params$RefRegexp, names(ProteinBank), value = TRUE)

# Positions (within UNIPROT_NAMES) of the names tagged as perfect match / BLAST.
UP_MATCH <- grep(",UNIPROT=", UNIPROT_NAMES)
UP_BLAST <- grep(",Blast=", UNIPROT_NAMES)

# Second "|"-separated field of every name, used as the entry identifier.
UP_Access_ID <- unlist(
  lapply(
    strsplit(UNIPROT_NAMES, split = "\\|"),
    function(fields) fields[2]
  )
)

# One row per name, one 0/1 column per tag.
UNIPROT_Presence.Matrix <- matrix(
  0,
  nrow = length(UP_Access_ID),
  ncol = 2,
  dimnames = list(UP_Access_ID, c("Match", "Blast"))
)
UNIPROT_Presence.Matrix[UP_MATCH, "Match"] <- 1
UNIPROT_Presence.Matrix[UP_BLAST, "Blast"] <- 1

UNIPROT_Presence.DF <- data.frame(UNIPROT_Presence.Matrix)
UNIPROT_Presence.DF$Uniprot_Entry <- UP_Access_ID

# Collapse the rows sharing the same identifier by summing their flags, then
# tabulate the Match/Blast combinations for the Venn diagram.
UNIPROT_Presence.collapsed.DF <- aggregate(
  UNIPROT_Presence.DF[, c("Match", "Blast")],
  by = list(UNIPROT_Presence.DF$Uniprot_Entry),
  FUN = sum
)
UNIPROT_Presence.collapsed.counts <- vennCounts(
  UNIPROT_Presence.collapsed.DF[, c("Match", "Blast")]
)
Protein bank file: /data/tmp/ltaing/WorkOnSplicifyData/IsoAndSpe-master/Sequences/UniprotAndProteinBank.TwoPass.KnownAndMostLikely.BlastPatched.LookalikeInUniprotNoSP.fasta, obtained from the Two Pass protocol.
Peptide file: /data/users/ltaing/DATA_TMP/ltaing/WorkOnSplicifyData/IsoAndSpe-master/Spectro/TwoPass/MQ_RES/peptides.txt, MaxQuant peptide file using the Protein bank file.
Blast file: /data/tmp/ltaing/WorkOnSplicifyData/IsoAndSpe-master/Sequences/TwoPass.Rescue.tsv, Blast hit file of the mRNA proteins that do not have a perfect match in the whole UNIPROT database.
Rationale for the protein bank construction.
The idea is to build a protein bank that is as close as possible to the mRNA (RNA-seq) results. In the present study we focus mainly on isoforms. From these isoforms and from UNIPROT knowledge, we can produce a protein bank composed of:
D: The main part are the mRNA with an ORF that match perfectly, meaning that those proteins are present in both the mRNA and also the UNIPROT databank (either canonical, isoforms or TrEMBL)
E: The Swissprot canonical part consists of canonical entries that did not perfectly match the CDS from the mRNA. Since the Swissprot canonical entries would have constituted the regular database for the mass spectrometry search, we consider it necessary to include all the ones missing from our approach.
C and B: The blast parts consist of the longest ORFs (LORFs) from the mRNA that, once translated, do not have a perfect match in UNIPROT but do have a blast hit (C). Based on previous observations and experiments, it has been decided that these LORFs would be put in competition with their subject hits in UNIPROT (B) to overcome some flaws in the mRNA part.
A: The LORF that neither match nor have a blast hit with anything.
# ---- Colours and counts used by the pipeline flow diagram ----
# Four semi-transparent viridis colours, one per kind of node.
GraphPalette <- viridis(4, alpha = 0.6)
mRNAColor <- GraphPalette[1]
UniprotColor <- GraphPalette[2]
ActionColor <- GraphPalette[3]
ProteinBankColor <- GraphPalette[4]

# Numeric value(s) logged under `variable` in the Info table.
info_value <- function(variable) {
  as.numeric(as.vector(Info[Info$Variable == variable, "value"]))
}

# Counts read from the pipeline logs.
N_mRNA <- info_value("mRNA_length")
N_ORFS <- info_value("All_ORFs_non_disjoint")
N_no_match <- info_value("LORFs_without_UNIPROT_perfect_match")
N_UNIPROT <- info_value("UNIPROT_length")
N_SP_whithout_perfect_match <- info_value("Swissprot_canonical_without_mRNA")
N_canonical <- info_value("Swissprot")
N_isoform <- info_value("Isoforms")
N_TrEMBL <- info_value("TrEMBL")
N_ProteinBank <- info_value("Protein_Bank")
N_BLAST_hit <- info_value("Blast_Hits")
N_Isoform_TrEBML_BLAST_hit <- info_value("Uniprot_add_suspect")

# Counts derived from the protein bank itself.
N_unknown <- length(Unknown)
N_match <- length(UP_MATCH)
# ---- Pipeline flow diagram (DiagrammeR / Graphviz) ----
# Overview: https://rich-iannone.github.io/DiagrammeR/graphviz_and_mermaid.html
# Attribute reference: https://graphviz.gitlab.io/_pages/doc/info/attrs.html
#
# grViz() takes the graph description as text and renders it as an HTML
# widget. The text is assembled with paste() so that the counts and colours
# computed earlier can be embedded in the node definitions;
# formatC(..., big.mark = " ") groups the digits to make the counts easier to
# read.
# NOTE: paste() is called with its default separator (a single space), so
# every interpolated count and colour is surrounded by spaces in the
# generated text.
#
# Notes on the graph description (as understood by the original author):
# - keywords: graph, node, edge
# - [] holds attributes/values, "," separated
# - node statements are ";" separated, edge statements are space separated
# - an edge is written: "From" node name -> "To" node name
# - shape, fontname, style, fontsize, width, height, fillcolor and label are
#   self-explanatory
# - overlap: not sure, supposedly controls whether nodes may overlap
# - rankdir: general orientation of the graph (LR = left to right)
# - splines: how the edges are drawn
#
# Ranks, declared at the end of the graph, map nodes onto shared positions.
# For example
#   { rank=same; A; B; C; 2; }
# means that nodes A, B, C and 2 share the same left-right alignment.
#
# width = 768 fits the final HTML file; height = 432 keeps a 16:9 ratio.
#
grViz(paste("
digraph boxes_and_circles {
# a 'graph' statement
graph [overlap = true,
fontsize =24,
rankdir = LR,
splines=ortho]
# several 'node' statements
node [shape = box,
fontname = Helvetica,
style = filled,
fontsize =24.0]
E[label=\"mRNA\nn: ", formatC(N_mRNA, big.mark=" ", format="d"), "\", fillcolor=\"", mRNAColor, "\",width=2,height=3];
F[label=\"All ORFs\nn: ", formatC(N_ORFS, big.mark=" ", format="d"), "\", fillcolor=\"", mRNAColor, "\",width=2,height=3];
I[label=\"mRNA\nwithout a ORF\nthat match\nperfectcly\nn: ",formatC(N_no_match, big.mark=" ", format="d"),"\",fillcolor=\"", mRNAColor, "\"];
J[label=\"Longuest\nORF\nper mRNA\nn: ", formatC(N_no_match, big.mark=" ", format="d"),"\",fillcolor=\"", mRNAColor, "\"];
A[label=\"Swissprot\ncanonical\nn: ", formatC(N_canonical, big.mark=" ", format="d"), "\",fillcolor=\"", UniprotColor, "\",width=3];
B[label=\"Swissprot\nisoform\nn: ", formatC(N_isoform, big.mark=" ", format="d"), "\",fillcolor=\"", UniprotColor, "\",width=3];
C[label=\"TrEMBL\nn: ", formatC(N_TrEMBL, big.mark=" ", format="d"), "\",fillcolor=\"", UniprotColor, "\",width=3];
D[label=\"UNIPROT\nn: ", formatC(N_UNIPROT, big.mark=" ", format="d"), "\",fillcolor=\"", UniprotColor, "\",width=3,height=3];
M[label=\"Protein bank\nfor\nMaxQuant\nn: ", formatC(N_ProteinBank, big.mark=" ", format="d"), "\",fillcolor=\"", ProteinBankColor, "\", height=9]
K[label=\"C: LORF\nwith\na blast hit\nn: ", formatC(N_BLAST_hit, big.mark=" ", format="d"), "\",fillcolor=\"", mRNAColor, "\",width=3.5];
L[label=\"A: Unknown\nLORF\nn: ", formatC(N_unknown, big.mark=" ", format="d"), "\",fillcolor=\"", mRNAColor, "\",width=3.5];
H[label=\"D: mRNA\nwith a ORF\nthat\nmatch perfectly\nn: ", formatC(N_match,big.mark=" ",format="d"),
"\",fillcolor=\"", UniprotColor, "\", width=3.5];
G[label=\"B: Isoform\nTrEmbl\nBlast HIT\nn: ", formatC(N_Isoform_TrEBML_BLAST_hit, big.mark=" ", format="d"), "\", fillcolor=\"", UniprotColor, "\", width=3.5];
N[label=\"E: Swissprot Canonical\nwithout\na perfect Match\nn: ", formatC(N_SP_whithout_perfect_match, big.mark=" ", format="d"), "\", fillcolor=\"", UniprotColor, "\", width=3.5];
node [shape = plaintext,fillcolor=\"", ActionColor, "\"];
1[label=\"Concat\nkeep only\nunique\nsequence\",height=3]; 2[label=\"Find ORFs\"];
3[label=\"Perfect\nMatch\",height=4]; 4[label=\"Select\"]; 5[label=\"BLAST\",height=4]
# several 'edge' statements
edge [len=2.0]
E->2 2->F F->3 3->I 4->J J->5 I->4 A->1 B->1 C->1 1->D D->3 D->5 5->K 5->G 5->L L->M G->M K->M H->M N->M 3->H 3->N
{ rank=same; A; B; C; 2; }
{ rank=same; F; 1; }
{ rank=same; D; 3; }
{ rank=same; I;}
{ rank=same; L; K; G; H; N}
}"),width=768,height=432)
The protein bank file contains 65 239 proteins.
The reference sequences are characterized by the “(sp|)|(tr|)” regular expression.
The mRNA sequences are characterized by the “strngt” regular expression.
# Venn diagram of the bank sequences by origin (reference vs. mRNA).
vennDiagram(
  SequencesCounts,
  names = c("From UNIPROT", "From mRNA"),
  main = "Perfect match"
)
Length histogram of the proteins in the bank.
# Interactive (plotly) histogram of the sequence widths of the protein bank.
plot_ly(
  type = "histogram",
  x = ~ width(ProteinBank)
)
The treemap represents the 46 862 proteins from the mRNAs of the protein bank. They are divided into 3 categories:
MATCH: those who match perfectly an UNIPROT entry (n=25 370)
BLAST: those who do not match perfectly an UNIPROT entry but share a BLAST hit. The blast hit range from part of the UNIPROT entry to a single mismatch (n=19 279)
UNKNOWN: those who neither match an UNIPROT entry nor share a BLAST hit (n=2 213)
# Treemap of the mRNA-derived proteins: grouped by match class, then by
# database form; tile size is the number of proteins in each class.
treemap::treemap(
  ProteinBank.Counts,
  type = "index",
  index = c("Match", "Form"),
  vSize = "Effectif",
  vColor = "Form"
)
From all the UNIPROT entries in the database, how do they interact with our mRNAs? Do they match perfectly some of our mRNA or do they have a BLAST hit?
# Venn diagram of the UniProt entries of the bank: perfect match with an mRNA
# vs. BLAST hit of an mRNA.
# The title has to be passed as `main` (forwarded to the underlying plot).
# Given positionally it is bound to vennDiagram()'s second parameter,
# `include`, and is never drawn.
vennDiagram(
  UNIPROT_Presence.collapsed.counts,
  main = "Uniprot Coverage"
)
The protein bank obtained from the mRNA pipeline is then used for proteomic analysis. The spectra have been searched with MaxQuant.
We focus here on the leading razor protein. The leading razor protein is the protein that MaxQuant considers the most likely when a peptide matches several proteins or isoforms, based on the number of peptides matching the protein.
This is the repartition of the leading razor proteins:
Contaminants are decoy artificially created by MaxQuant
Only in Uniprot are proteins from Uniprot protein bank
Both in UNIPROT and mRNA are the protein that we found both in UNIPROT and also from mRNA
Only in mRNA are the protein that we found only in mRNA
# ---- Leading razor proteins by origin ----
# Split the leading razor proteins of the peptide table into contaminant
# entries (name contains "CON__" or "REV__") and all the others.
leading_razor <- Pep[["Leading razor protein"]]
is_contaminant <- grepl("(CON|REV)__", leading_razor)
ContamsInMS <- unique(leading_razor[is_contaminant])
ProteinsInMS.NoContam <- unique(leading_razor[!is_contaminant])
N.ContamsInMS <- length(ContamsInMS)

# Entries whose name starts with "sp|". The idea is that an entry only present
# in Swiss-Prot is listed first, whether or not it also matches a TrEMBL entry
# or another Swiss-Prot isoform.
# NOTE(review): names starting with "tr|" fall in none of the four groups
# below, although the "^tr\\|" entries are counted later in the script --
# confirm this is intended.
ProteinsInMS.NoContam.SwissprotOnly <- grep(
  "^sp\\|", ProteinsInMS.NoContam, value = TRUE
)
N.Uniprot <- length(ProteinsInMS.NoContam.SwissprotOnly)

# Leading razor proteins that come from the mRNA, split on whether they also
# have a perfect match in UniProt ("UNIPROT=" tag) or not.
ProteinsInMS.NoContam.Mrna <- grep("^strngt", ProteinsInMS.NoContam, value = TRUE)
has_uniprot_tag <- grepl("UNIPROT=", ProteinsInMS.NoContam.Mrna)
ProteinsInMS.NoContam.Mrna.Both <- ProteinsInMS.NoContam.Mrna[has_uniprot_tag]
N.Both <- length(ProteinsInMS.NoContam.Mrna.Both)
ProteinsInMS.NoContam.Mrna.MrnaOnly <- ProteinsInMS.NoContam.Mrna[!has_uniprot_tag]
N.Mrna <- length(ProteinsInMS.NoContam.Mrna.MrnaOnly)

# Summary table (group order is fixed through the factor levels), shown both
# as an HTML table and as a single stacked bar.
group_labels <- c(
  "Contaminants", "Only in Uniprot", "Both in UNIPROT and MRNA", "Only in MRNA"
)
df <- data.frame(
  group = factor(group_labels, levels = group_labels),
  Effectif = c(N.ContamsInMS, N.Uniprot, N.Both, N.Mrna)
)
datatable(df, rownames = FALSE, options = list(dom = 'tir'))
g <- ggplot(df, aes(x = "", y = Effectif, fill = group)) +
  geom_col(width = 1) +
  scale_fill_manual(values = viridis(4)) +
  labs(title = "Two pass/Leading Razon protein") +
  theme_bw()
print(g)
Of all the 8289 leading razor protein, what kind of uniprot entry are they? and where do they come from?
Uniprot entry:
The canonical correspond to the usual database for proteomic analysis. They consist of manually curated sequences.
The Isoform correspond to other version canonical entries, alternatives sequences, Insertions/Deletions.
The TrEMBL consist of automaticaly annotated proteins.
Origin:
mRNA Perfect Match corresponds to Uniprot entries with an open reading frame from a mRNA that once translated match perfectly to an Uniprot entry.
Reinjected corresponds to the Uniprot Blast hit of the longest open reading frame from a mRNA that, once translated, matches an Uniprot entry imperfectly. That Uniprot entry is then reinjected into the Protein Bank. It will be put in competition with the longest ORF. We also decided to reinject the Uniprot canonical entries that were not present in the protein bank as a precaution.
mRNA blast corresponds to proteins obtained from the longest ORF that matches imperfectly and is still chosen over its competition.
unknown corresponds to longuest open reading frame that does not match anything in Uniprot.
# ---- Leading razor proteins by database (canonical / isoform / TrEMBL) and
# ---- by origin (reinjected / mRNA perfect match / mRNA blast / unknown)

# Reinjected UniProt entries: the name itself starts with "sp|" or "tr|".
# A Swiss-Prot entry is canonical when there is no "-" before the "|" that
# closes its accession; every other Swiss-Prot entry is treated as an isoform.
ProteinsInMS.NoContam.Swissprot <- grep("^sp\\|", ProteinsInMS.NoContam, value = TRUE)
sp_is_canonical <- grepl("^sp\\|[^-]*\\|", ProteinsInMS.NoContam.Swissprot)
ProteinsInMS.NoContam.Swissprot.canonical <- ProteinsInMS.NoContam.Swissprot[sp_is_canonical]
ProteinsInMS.NoContam.Swissprot.Iso <- ProteinsInMS.NoContam.Swissprot[!sp_is_canonical]
ProteinsInMS.NoContam.TR <- grep("^tr\\|", ProteinsInMS.NoContam, value = TRUE)

# mRNA-derived entries. The prefix is "^strngt", the same one used for the
# leading razor summary earlier in the script (it used to be the looser "^st").
ProteinsInMS.NoContam.mRNA <- grep("^strngt", ProteinsInMS.NoContam, value = TRUE)

# mRNA entries with a perfect UniProt match, split by the kind of entry named
# in the "UNIPROT=" tag.
ProteinsInMS.NoContam.mRNA.PerfectMatch <- grep(
  "UNIPROT=", ProteinsInMS.NoContam.mRNA, value = TRUE
)
ProteinsInMS.NoContam.mRNA.PerfectMatch.canonical <- grep(
  "UNIPROT=sp\\|[^-]*\\|", ProteinsInMS.NoContam.mRNA.PerfectMatch, value = TRUE
)
ProteinsInMS.NoContam.mRNA.PerfectMatch.Iso <- grep(
  "UNIPROT=sp\\|[^-]*-[^-]*\\|", ProteinsInMS.NoContam.mRNA.PerfectMatch, value = TRUE
)
ProteinsInMS.NoContam.mRNA.PerfectMatch.TR <- grep(
  "UNIPROT=tr\\|", ProteinsInMS.NoContam.mRNA.PerfectMatch, value = TRUE
)

# mRNA entries with a BLAST hit, split by the kind of entry named in the
# "Blast=" tag.
ProteinsInMS.NoContam.mRNA.LookAlike <- grep(
  "Blast=", ProteinsInMS.NoContam.mRNA, value = TRUE
)
ProteinsInMS.NoContam.mRNA.LookAlike.canonical <- grep(
  "Blast=sp\\|[^-]*\\|", ProteinsInMS.NoContam.mRNA.LookAlike, value = TRUE
)
ProteinsInMS.NoContam.mRNA.LookAlike.Iso <- grep(
  "Blast=sp\\|[^-]*-[^-]*\\|", ProteinsInMS.NoContam.mRNA.LookAlike, value = TRUE
)
ProteinsInMS.NoContam.mRNA.LookAlike.TR <- grep(
  "Blast=tr\\|", ProteinsInMS.NoContam.mRNA.LookAlike, value = TRUE
)

# mRNA entries with neither a perfect match nor a BLAST hit.
ProteinsInMS.NoContam.mRNAOnly <- ProteinsInMS.NoContam.mRNA[
  !grepl("UNIPROT=", ProteinsInMS.NoContam.mRNA) &
    !grepl("Blast=", ProteinsInMS.NoContam.mRNA)
]

# Long-format summary. The counts are listed in the same order as the
# Origin/Database rows: canonical, isoform, TrEMBL for each of the three
# origins, then the unknown mRNA class.
LeadingRazor.DF <- data.frame(
  Origin = c(
    rep("Reinjected", 3), rep("mRNAPerfectMatch", 3), rep("mRNABlast", 3), "mRNA"
  ),
  Database = c(rep(c("canonical", "isoform", "TrEMBL"), 3), "unknown"),
  Effectif = as.numeric(c(
    length(ProteinsInMS.NoContam.Swissprot.canonical),
    length(ProteinsInMS.NoContam.Swissprot.Iso),
    length(ProteinsInMS.NoContam.TR),
    length(ProteinsInMS.NoContam.mRNA.PerfectMatch.canonical),
    length(ProteinsInMS.NoContam.mRNA.PerfectMatch.Iso),
    length(ProteinsInMS.NoContam.mRNA.PerfectMatch.TR),
    length(ProteinsInMS.NoContam.mRNA.LookAlike.canonical),
    length(ProteinsInMS.NoContam.mRNA.LookAlike.Iso),
    length(ProteinsInMS.NoContam.mRNA.LookAlike.TR),
    length(ProteinsInMS.NoContam.mRNAOnly)
  ))
)

datatable(LeadingRazor.DF, rownames = FALSE, options = list(dom = 'tir'))
treemap(
  LeadingRazor.DF,
  index = c("Database", "Origin"),
  vSize = "Effectif",
  type = "index",
  align.labels = list(c("center", "center"), c("left", "top"))
)
We isolated the leading razor protein identified as isoforms with at least a proteotypic peptide, meaning a peptide that match only one protein.
Each line of this table represent a Leading razor protein with at least one proteotypic peptide. In these tables we present proteins obtained from our mrna and that are similar to the sequence in the whole UNIPROT protein bank.
Column names:
More information can be displayed by clicking on the green button. To hide this information, you can click on the red button.
Name: Name of the Leading Razor Protein. It contains the name of the mRNA followed by information regarding the ORF on the mRNA. It also contains information on its UNIPROT counterpart.
N_Peptides: The number of peptides that match this protein.
Match_Proteins: Every single protein matched by the present peptides. A peptide sequence can match either one protein in the protein bank, in which case the peptide is called proteotypic, or several proteins. The match proteins are the list of every single protein matched by these peptides. Since we are focusing on isoforms, we should find the canonical form among the match proteins.
N_Match_Proteins: The number of distinct proteins matched by all the present peptides.
PEPTIDES: Table of each peptide matching this protein. Each line in this subtable is a peptide that has been attributed to this very leading razor protein.
Start: Beginning of the match on the protein sequence
End: End of the match on the protein sequence
Pep: Confidence peptide score as calculated by MaxQuant, similar to a p-value
Proteins: Possible protein matches for this very peptide. The matched proteins are displayed in short form. Either the mRNA is displayed when it comes from a mRNA blast hit, or the UNIPROT entry name when there is a perfect match.
Sequence: Peptide sequence, as interpreted by MaxQuant.
Sequence: Leading razor protein sequence displayed in HTML enriched form. The sequence of the leading razor protein is displayed for a quick overview of its mass spec results. Graphical emphasis is put on the peptide match, the blast shared zone of the leading razor protein and also the differences with the UNIPROT entry’s sequence. If you hover the mouse cursor over the sequence, it should display any pertinent information relative to the sequence at this particular position.
Sequence description
Peptide formalism: EXAMPLE
Proteotypic peptide formalism: EXAMPLE
Isoform specific sequence: EXAMPLE
# ---- Table of the isoform leading razor proteins with a proteotypic peptide

# Leading razor proteins identified as isoforms: mRNA entries with a perfect
# match to a Swiss-Prot isoform, plus the Swiss-Prot isoform entries.
Isoforms <- c(
  ProteinsInMS.NoContam.mRNA.PerfectMatch.Iso,
  ProteinsInMS.NoContam.Swissprot.Iso
)

# Keep the isoforms that appear on their own in the "Proteins" column of the
# peptide table, i.e. that have at least one peptide matching only them.
Isoforms.Proteotypic <- Isoforms[Isoforms %in% Pep$Proteins]

# Truncate every bank name at its first space. This overwrites the names of
# the global ProteinBank for the rest of the script.
names(ProteinBank) <- gsub(" .*", "", names(ProteinBank))

# One result per isoform, built by DATAFRAME_MakeALineForAIsoProt (not defined
# in this file; presumably provided by Utils.R -- TODO confirm).
# NOTE(review): the trailing 80 is passed positionally; its meaning is set by
# the helper's signature, which is not visible here.
Pep.Iso.List <- lapply(Isoforms.Proteotypic, function(razor_protein) {
  DATAFRAME_MakeALineForAIsoProt(
    LeadingRazorProtein = razor_protein,
    DF_Peptides = Pep,
    BS_ProteinBank = ProteinBank,
    80
  )
})

# bind_rows() is namespace-qualified because dplyr is not attached by any
# library() call in this file; the call no longer depends on another script
# attaching it.
Pep.Iso.data.table <- dplyr::bind_rows(Pep.Iso.List)

# escape = FALSE lets the HTML contained in the cells be rendered as HTML.
DT::datatable(
  Pep.Iso.data.table,
  rownames = FALSE,
  escape = FALSE,
  options = list(dom = 'ftipr'),
  extensions = 'Responsive'
)
152 455 peptides have been identified with 2 457 788 MS/MS.
# ---- Distribution of the PEP scores of the peptide table ----
# Plain, unnamed vector of the PEP column.
PEP.PEP <- unname(unlist(Pep[, "PEP"]))
# Floor the scores at 1e-300 so that -log10 stays within [0, 300], the range
# covered by the histogram breaks below.
PEP.PEP <- pmax(PEP.PEP, 1e-300)

minus_log10_pep <- -log(PEP.PEP, 10)

hist(
  minus_log10_pep,
  breaks = seq(0, 300, 1),
  main = "PEP score repartition - histogram",
  xlab = "-log_10(PEP)",
  ylab = "Frequency"
)
plot(
  ecdf(minus_log10_pep),
  main = "PEP score repartition - empirical cumulative distribution",
  xlab = "-log_10(PEP)"
)
This part focuses on the proteins of the bank that come from the mRNA and do not have a perfect match.
Even though they do not have a perfect match in UNIPROT they might have a blast hit on UNIPROT.
Each line of this table represent a Leading razor protein with at least one proteotypic peptide. In these tables we present proteins obtained from our mrna and that are similar to the sequence in the whole UNIPROT protein bank.
Column names:
More informations can be diplayed by clicking on the green button. To hide these informations, you can click on the red button.
Name: Name of the mRNA; the complementary information of the mRNA regarding the ORF has been suppressed for better readability. By extension, the name of the mRNA will be used as the name of its resulting ORF. The chosen ORF among all ORFs available on a mRNA is the longest one with a match. If there is no Blast hit, the longest ORF is used.
Blast Hit: For a given mRNA, if blast matches a sequence in the whole UNIPROT database, the blast hit is the name of this sequence. In case several sequences are matched, the one with the best score is considered the best match and is the only one displayed. Sometimes you can encounter 2 or more stacked UNIPROT entries; this corresponds to sequence redundancy between the different sub-banks in UNIPROT.
N_Peptides: The number of peptides that match this protein.
Match_Proteins: Every single protein matched by the present peptides. A peptide sequence can match either one protein in the protein bank, in which case the peptide is called proteotypic, or several proteins. The match proteins are the list of every single protein matched by these peptides.
N_Match_Proteins: The number of distinct proteins matched by all the present peptides.
PEPTIDES: Table of each peptide matching this protein. Each line in this subtable is a peptide that has been attributed to this very leading razor protein.
Start: Beginning of the match on the protein sequence
End: End of the match on the protein sequence
Pep: Confidence peptide score as calculated by MaxQuant, similar to a p-value
Proteins: Possible protein matches for this very peptide. The matched proteins are displayed in short form. Either the mRNA is displayed when it comes from a mRNA blast hit, or the UNIPROT entry name when there is a perfect match.
Sequence: Peptide sequence, as interpreted by MaxQuant.
Sequence: Leading razor protein sequence displayed in HTML enriched form. The sequence of the leading razor protein is displayed for a quick overview of its mass spec results. Graphical emphasis is put on the peptide match, the blast shared zone of the leading razor protein and also the differences with the UNIPROT entry’s sequence. If you hover the mouse cursor over the sequence, it should display any pertinent information relative to the sequence at this particular position.
Sequence description
Peptide formalism: EXAMPLE
Proteotypic peptide formalism: EXAMPLE
Blast formalism: EXAMPLE
Missmatch formalism: EXAMPLE
Insertion formalism: EXAMPLE
Deletion formalism: EXAMPLE